if(!is.null(dev.list())) dev.off() # Clear plots
rm(list=ls()) # Clean workspace
We load the necessary packages and set working directory
setwd("~/Documents/uni/FIB-ADEI-LAB/deliverable3")
filepath<-"~/Documents/uni/FIB-ADEI-LAB/deliverable3"
#setwd("C:/Users/Claudia Sánchez/Desktop/FIB/TARDOR 2020-2021/ADEI/DELIVERABLE1/FIB-ADEI-LAB/deliverable2")
#filepath<-"C:/Users/Claudia Sánchez/Desktop/FIB/TARDOR 2020-2021/ADEI/DELIVERABLE1/FIB-ADEI-LAB/deliverable2"
# Load Required Packages
options(contrasts=c("contr.treatment","contr.treatment"))
requiredPackages <- c("missMDA","chemometrics","mvoutlier","effects","FactoMineR","car","lmtest","ggplot2","moments","factoextra","RColorBrewer","dplyr","ggmap","ggthemes","knitr")
missingPackages <- requiredPackages[!(requiredPackages %in% installed.packages()[,"Package"])]
if(length(missingPackages)) install.packages(missingPackages)
lapply(requiredPackages, require, character.only = TRUE)
load(paste0(filepath,"/Taxi5000_del2.RData"))
#load("C:/Users/Claudia Sánchez/Desktop/FIB/TARDOR 2020-2021/ADEI/DELIVERABLE1/FIB-ADEI-LAB/deliverable3/Taxi5000_del2.RData")
summary(df)
## VendorID RateCodeID Pickup_longitude Pickup_latitude
## f.Vendor-Mobile : 973 Rate-1 :4496 Min. :-74.02 Min. :40.58
## f.Vendor-VeriFone:3650 Rate-Other: 127 1st Qu.:-73.96 1st Qu.:40.70
## Median :-73.94 Median :40.75
## Mean :-73.93 Mean :40.75
## 3rd Qu.:-73.92 3rd Qu.:40.80
## Max. :-73.80 Max. :40.86
## Dropoff_longitude Dropoff_latitude Passenger_count Trip_distance
## Min. :-74.02 Min. :40.58 Min. :1.000 Min. : 0.010
## 1st Qu.:-73.96 1st Qu.:40.70 1st Qu.:1.000 1st Qu.: 1.010
## Median :-73.94 Median :40.75 Median :1.000 Median : 1.760
## Mean :-73.93 Mean :40.75 Mean :1.371 Mean : 2.724
## 3rd Qu.:-73.91 3rd Qu.:40.79 3rd Qu.:1.000 3rd Qu.: 3.400
## Max. :-73.80 Max. :40.86 Max. :6.000 Max. :30.000
## Fare_amount Extra MTA_tax Tip_amount Tolls_amount
## Min. : 1.00 Min. :0.0000 No : 119 Min. : 0.000 Min. :0.0000
## 1st Qu.: 6.00 1st Qu.:0.0000 Yes:4504 1st Qu.: 0.000 1st Qu.:0.0000
## Median : 9.00 Median :0.5000 Median : 0.000 Median :0.0000
## Mean :11.61 Mean :0.3523 Mean : 1.022 Mean :0.0477
## 3rd Qu.:14.50 3rd Qu.:0.5000 3rd Qu.: 1.700 3rd Qu.:0.0000
## Max. :60.00 Max. :1.0000 Max. :17.000 Max. :5.5400
## improvement_surcharge Total_amount Payment_type Trip_type
## No : 118 Min. : 0.00 Credit card:2096 Street-Hail:4511
## Yes:4505 1st Qu.: 7.80 Cash :2497 Dispatch : 112
## Median : 10.80 No paid : 30
## Mean : 13.93
## 3rd Qu.: 17.00
## Max. :128.76
## hour period tlenkm traveltime
## Min. : 0.0 Period night :1642 Min. : 1.000 Min. : 0.000
## 1st Qu.: 9.0 Period morning : 542 1st Qu.: 1.609 1st Qu.: 5.767
## Median :15.0 Period valley :1260 Median : 2.800 Median : 9.550
## Mean :13.4 Period afternoon:1179 Mean : 4.385 Mean :12.487
## 3rd Qu.:19.0 3rd Qu.: 5.472 3rd Qu.:16.125
## Max. :23.0 Max. :48.280 Max. :60.000
## espeed pickup dropoff Trip_distance_range
## Min. : 3.00 Length:4623 Length:4623 Long_dist : 666
## 1st Qu.:14.83 Class :character Class :character Medium_dist: 986
## Median :18.56 Mode :character Mode :character Short_dist :2971
## Mean :20.34
## 3rd Qu.:23.58
## Max. :55.00
## TipIsGiven passenger_groups paidTolls hcpck claKM
## No :2882 Couple: 345 No :4580 kHP-1:1930 kKM-3: 844
## Yes:1741 Group : 395 Yes: 43 kHP-2:1634 kKM-5:2353
## Single:3883 kHP-3: 262 kKM-2: 486
## kHP-4: 758 kKM-1: 831
## kHP-5: 39 kKM-4: 109
##
## f.cost f.tt hcpckMCA hcpckMCA_hcpck hcpckMCA_claKM
## (11,18] :1188 (10,15]: 913 1: 30 kHPmca-4: 43 kHPmca-2:1620
## (18,30] : 724 (15,20]: 549 2:1620 kHPmca-3:2813 kHPmca-3:2813
## (30,50] : 221 (20,60]: 756 3:2813 kHPmca-2:1620 kHPmca-1: 30
## (50,129): 63 (5,10] :1511 4: 43 kHPmca-1: 30 kHPmca-4: 43
## (8,11] :1151 [0,5] : 894 5: 117 kHPmca-5: 117 kHPmca-5: 117
## [0,8] :1276
names(df)
## [1] "VendorID" "RateCodeID" "Pickup_longitude"
## [4] "Pickup_latitude" "Dropoff_longitude" "Dropoff_latitude"
## [7] "Passenger_count" "Trip_distance" "Fare_amount"
## [10] "Extra" "MTA_tax" "Tip_amount"
## [13] "Tolls_amount" "improvement_surcharge" "Total_amount"
## [16] "Payment_type" "Trip_type" "hour"
## [19] "period" "tlenkm" "traveltime"
## [22] "espeed" "pickup" "dropoff"
## [25] "Trip_distance_range" "TipIsGiven" "passenger_groups"
## [28] "paidTolls" "hcpck" "claKM"
## [31] "f.cost" "f.tt" "hcpckMCA"
## [34] "hcpckMCA_hcpck" "hcpckMCA_claKM"
names(df)[names(df) == "VendorID"] <- "f.vendor_id"
names(df)[names(df) == "RateCodeID"] <- "f.code_rate_id"
names(df)[names(df) == "Pickup_longitude"] <- "q.pickup_longitude"
names(df)[names(df) == "Pickup_latitude"] <- "q.pickup_latitude"
names(df)[names(df) == "Dropoff_longitude"] <- "q.dropoff_longitude"
names(df)[names(df) == "Dropoff_latitude"] <- "q.dropoff_latitude"
names(df)[names(df) == "Passenger_count"] <- "q.passenger_count"
names(df)[names(df) == "Trip_distance"] <- "q.trip_distance"
names(df)[names(df) == "Fare_amount"] <- "q.fare_amount"
names(df)[names(df) == "Extra"] <- "q.extra"
names(df)[names(df) == "MTA_tax"] <- "f.mta_tax"
names(df)[names(df) == "Tip_amount"] <- "q.tip_amount"
names(df)[names(df) == "Tolls_amount"] <- "q.tolls_amount"
names(df)[names(df) == "improvement_surcharge"] <- "f.improvement_surcharge"
names(df)[names(df) == "Total_amount"] <- "target.total_amount"
names(df)[names(df) == "Payment_type"] <- "f.payment_type"
names(df)[names(df) == "Trip_type"] <- "f.trip_type"
names(df)[names(df) == "hour"] <- "q.hour"
names(df)[names(df) == "period"] <- "f.period"
names(df)[names(df) == "tlenkm"] <- "q.tlenkm"
names(df)[names(df) == "traveltime"] <- "q.traveltime"
names(df)[names(df) == "espeed"] <- "q.espeed"
names(df)[names(df) == "pickup"] <- "qual.pickup"
names(df)[names(df) == "dropoff"] <- "qual.dropoff"
names(df)[names(df) == "Trip_distance_range"] <- "f.trip_distance_range"
names(df)[names(df) == "paidTolls"] <- "f.paid_tolls"
names(df)[names(df) == "TipIsGiven"] <- "target.tip_is_given"
names(df)[names(df) == "passenger_groups"] <- "f.passenger_groups"
#names(df)[names(df) == "f.cost"] <- ""
#names(df)[names(df) == "f.tt"] <- ""
df$hcpck <- NULL
df$claKM <- NULL
df$hcpckMCA <- NULL
df$hcpckMCA_hcpck <- NULL
df$hcpckMCA_claKM <- NULL
names(df)
## [1] "f.vendor_id" "f.code_rate_id"
## [3] "q.pickup_longitude" "q.pickup_latitude"
## [5] "q.dropoff_longitude" "q.dropoff_latitude"
## [7] "q.passenger_count" "q.trip_distance"
## [9] "q.fare_amount" "q.extra"
## [11] "f.mta_tax" "q.tip_amount"
## [13] "q.tolls_amount" "f.improvement_surcharge"
## [15] "target.total_amount" "f.payment_type"
## [17] "f.trip_type" "q.hour"
## [19] "f.period" "q.tlenkm"
## [21] "q.traveltime" "q.espeed"
## [23] "qual.pickup" "qual.dropoff"
## [25] "f.trip_distance_range" "target.tip_is_given"
## [27] "f.passenger_groups" "f.paid_tolls"
## [29] "f.cost" "f.tt"
Remove total amount equal to 0
df<-df[!(df$target.total_amount=="0"),]
We must create: f.cost, f.dist, f.tt and f.hour. We already have f.cost and f.tt, so we will only have to create f.dist and f.hour:
df$f.dist[df$q.trip_distance<=1.6] = "(0, 1.6]"
df$f.dist[(df$q.trip_distance>1.6) & (df$q.trip_distance<=3)] = "(1.6, 3]"
df$f.dist[(df$q.trip_distance>3) & (df$q.trip_distance<=5.5)] = "(3, 5.5]"
df$f.dist[(df$q.trip_distance>5.5) & (df$q.trip_distance<=30)] = "(5.5, 30]"
df$f.dist<-factor(df$f.dist)
df$f.hour[(df$q.hour>=17) & (df$q.hour<18)] = "17"
df$f.hour[(df$q.hour>=18) & (df$q.hour<19)] = "18"
df$f.hour[(df$q.hour>=19) & (df$q.hour<20)] = "19"
df$f.hour[(df$q.hour>=20) & (df$q.hour<21)] = "20"
df$f.hour[(df$q.hour>=21) & (df$q.hour<22)] = "21"
df$f.hour[(df$q.hour>=22) & (df$q.hour<23)] = "22"
df$f.hour[(df$q.hour<17)] = "other"
df$f.hour[(df$q.hour>=23)] = "other"
df$f.hour<-factor(df$f.hour)
df$f.espeed[(df$q.espeed>=3) & (df$q.espeed<10)] = "[03,10)"
df$f.espeed[(df$q.espeed>=10) & (df$q.espeed<20)] = "[10,20)"
df$f.espeed[(df$q.espeed>=20) & (df$q.espeed<30)] = "[20,30)"
df$f.espeed[(df$q.espeed>=30) & (df$q.espeed<40)] = "[30,40)"
df$f.espeed[(df$q.espeed>=40) & (df$q.espeed<50)] = "[40,50)"
df$f.espeed[(df$q.espeed>=50) & (df$q.espeed<=55)] = "[50,55]"
df$f.espeed<-factor(df$f.espeed)
vars_con<-names(df)[c(3:10,12:13,15,18,20:22)];
vars_dis<-names(df)[c(1:2,16,19,27:32)];
vars_res<-names(df)[c(15,27)];
vars_cexp<-vars_con[c(5:10,12:15)];
This variable will be the target for linear model building (connected to blocks Statistical Modeling I and II).
Before we begin to see correlations with our target, we should consider the normality of this.
hist(df$target.total_amount,50,freq=F,col="darkslateblue",border = "darkslateblue")
mm<-mean(df$target.total_amount);ss<-sd(df$target.total_amount)
curve(dnorm(x,mean=mm,sd=ss),col="red",lwd=2,lty=3, add=T)
shapiro.test(df$target.total_amount)
##
## Shapiro-Wilk normality test
##
## data: df$target.total_amount
## W = 0.73071, p-value < 2.2e-16
We see that the target total_amount is not normally distributed for the following reasons:
skewness(df$target.total_amount)
## [1] 3.176789
Normal data should have 0 skewness: we see that our data is right skewed (3.18).
kurtosis(df$target.total_amount)
## [1] 21.09556
Normal data should be 3. We have 21.1, so, in this case, our data is not normal.
res.con <- condes(df,num.var=which(names(df)=="target.total_amount"))
res.con$quanti
## correlation p.value
## q.fare_amount 0.94425003 0.000000e+00
## q.trip_distance 0.89702734 0.000000e+00
## q.tlenkm 0.88671294 0.000000e+00
## q.traveltime 0.76448863 0.000000e+00
## q.tip_amount 0.56622837 0.000000e+00
## q.espeed 0.39683909 9.313540e-174
## q.tolls_amount 0.25751662 9.659999e-71
## q.hour -0.03110910 3.465376e-02
## q.pickup_longitude -0.04064371 5.775239e-03
## q.dropoff_longitude -0.06391905 1.401371e-05
## q.pickup_latitude -0.12322848 4.560732e-17
## q.dropoff_latitude -0.14812217 4.926074e-24
Com hem pogut veure abans, les variables més correlacionades són:
res.con$quali
## R2 p.value
## f.trip_distance_range 0.567177647 0.000000e+00
## f.cost 0.908376615 0.000000e+00
## f.tt 0.539010171 0.000000e+00
## f.dist 0.636791987 0.000000e+00
## f.espeed 0.171132867 1.210354e-184
## f.paid_tolls 0.079593357 4.072991e-85
## target.tip_is_given 0.057803014 1.250800e-61
## f.payment_type 0.052910669 4.024719e-55
## f.code_rate_id 0.018930689 6.290954e-21
## f.mta_tax 0.005160632 1.044478e-06
## f.trip_type 0.003203349 1.204051e-04
## f.improvement_surcharge 0.002760154 3.583467e-04
## qual.dropoff 0.008369578 2.171667e-02
To talk about factor variables, we need to visualize res.con$quali. So let’s see:
vars_cexp
## [1] "q.passenger_count" "q.trip_distance" "q.fare_amount"
## [4] "q.extra" "q.tip_amount" "q.tolls_amount"
## [7] "q.hour" "q.tlenkm" "q.traveltime"
## [10] "q.espeed"
cor(df$q.trip_distance,df$q.tlenkm)
## [1] 0.9951289
To give an example, we see that the two distances we have, trip_distance and tlenkm, are closely related, since they represent the same.
model_1 <- lm(
target.total_amount~.
,data=df[,c("target.total_amount",vars_cexp)]
)
summary(model_1)
##
## Call:
## lm(formula = target.total_amount ~ ., data = df[, c("target.total_amount",
## vars_cexp)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.562 -0.198 -0.055 0.071 94.934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.153602 0.189353 11.373 < 2e-16 ***
## q.passenger_count 0.008078 0.036749 0.220 0.826033
## q.trip_distance 0.241864 0.160027 1.511 0.130756
## q.fare_amount 0.907127 0.014705 61.687 < 2e-16 ***
## q.extra 1.072076 0.107278 9.993 < 2e-16 ***
## q.tip_amount 1.045374 0.023134 45.189 < 2e-16 ***
## q.tolls_amount 1.032744 0.077728 13.287 < 2e-16 ***
## q.hour -0.000386 0.005808 -0.066 0.947009
## q.tlenkm 0.303267 0.091687 3.308 0.000948 ***
## q.traveltime -0.062887 0.008534 -7.369 2.02e-13 ***
## q.espeed -0.070566 0.007275 -9.700 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.581 on 4600 degrees of freedom
## Multiple R-squared: 0.934, Adjusted R-squared: 0.9338
## F-statistic: 6506 on 10 and 4600 DF, p-value: < 2.2e-16
Model_1 explains 93.4% of the variability of the target. We also see, according to the F-statistic, that it should be rejected.
We cannot use variables that are so correlated at the same time to act as explanatory variables. Therefore, we need to make a model in which we do not have these correlations.
But first, let’s see which of them are that correlated:
vif(model_1) # Check association between explanatory vars
## q.passenger_count q.trip_distance q.fare_amount q.extra
## 1.004241 137.215426 10.203484 1.071071
## q.tip_amount q.tolls_amount q.hour q.tlenkm
## 1.247479 1.069987 1.073015 116.473412
## q.traveltime q.espeed
## 5.069225 2.779880
When the variance inflation factor is greater than 5, we need to consider whether or not we keep a variable.
In this case we have to choose how far we stay. Since we work better with km than with miles (or inches, or whatever it is), we could choose the variable q.tlenkm.
model_1_bic <- step( model_1, k=log(nrow(df)) )
## Start: AIC=8826.82
## target.total_amount ~ q.passenger_count + q.trip_distance + q.fare_amount +
## q.extra + q.tip_amount + q.tolls_amount + q.hour + q.tlenkm +
## q.traveltime + q.espeed
##
## Df Sum of Sq RSS AIC
## - q.hour 1 0.0 30650 8818.4
## - q.passenger_count 1 0.3 30650 8818.4
## - q.trip_distance 1 15.2 30665 8820.7
## <none> 30649 8826.8
## - q.tlenkm 1 72.9 30722 8829.3
## - q.traveltime 1 361.8 31011 8872.5
## - q.espeed 1 626.9 31276 8911.8
## - q.extra 1 665.4 31315 8917.4
## - q.tolls_amount 1 1176.2 31826 8992.0
## - q.tip_amount 1 13605.8 44255 10512.3
## - q.fare_amount 1 25354.6 56004 11597.9
##
## Step: AIC=8818.39
## target.total_amount ~ q.passenger_count + q.trip_distance + q.fare_amount +
## q.extra + q.tip_amount + q.tolls_amount + q.tlenkm + q.traveltime +
## q.espeed
##
## Df Sum of Sq RSS AIC
## - q.passenger_count 1 0.3 30650 8810.0
## - q.trip_distance 1 15.3 30665 8812.2
## <none> 30650 8818.4
## - q.tlenkm 1 72.9 30722 8820.9
## - q.traveltime 1 362.0 31012 8864.1
## - q.espeed 1 629.8 31279 8903.7
## - q.extra 1 702.0 31351 8914.4
## - q.tolls_amount 1 1176.2 31826 8983.6
## - q.tip_amount 1 13611.9 44261 10504.5
## - q.fare_amount 1 25371.8 56021 11590.9
##
## Step: AIC=8810
## target.total_amount ~ q.trip_distance + q.fare_amount + q.extra +
## q.tip_amount + q.tolls_amount + q.tlenkm + q.traveltime +
## q.espeed
##
## Df Sum of Sq RSS AIC
## - q.trip_distance 1 15.2 30665 8803.9
## <none> 30650 8810.0
## - q.tlenkm 1 73.0 30723 8812.5
## - q.traveltime 1 362.1 31012 8855.7
## - q.espeed 1 629.6 31279 8895.3
## - q.extra 1 705.4 31355 8906.5
## - q.tolls_amount 1 1176.9 31827 8975.3
## - q.tip_amount 1 13614.4 44264 10496.3
## - q.fare_amount 1 25372.8 56023 11582.6
##
## Step: AIC=8803.85
## target.total_amount ~ q.fare_amount + q.extra + q.tip_amount +
## q.tolls_amount + q.tlenkm + q.traveltime + q.espeed
##
## Df Sum of Sq RSS AIC
## <none> 30665 8803.9
## - q.traveltime 1 387 31052 8853.2
## - q.espeed 1 615 31280 8886.9
## - q.extra 1 700 31365 8899.5
## - q.tolls_amount 1 1165 31830 8967.4
## - q.tlenkm 1 1873 32538 9068.8
## - q.tip_amount 1 13724 44389 10500.9
## - q.fare_amount 1 33519 64184 12201.2
The BIC has been eliminating the variables it has considered, without worsening the AIC. However, since it does not take into account either correlations or concepts, it is probably not optimal.
Let’s see how it turned out:
vif(model_1_bic)
## q.fare_amount q.extra q.tip_amount q.tolls_amount q.tlenkm
## 7.898396 1.008633 1.241575 1.065918 9.377307
## q.traveltime q.espeed
## 4.984224 2.717538
Note that tlenkm still has a vif greater than 5 (9.377307), and so does fare_amount (7.898396).
summary(model_1_bic)
##
## Call:
## lm(formula = target.total_amount ~ q.fare_amount + q.extra +
## q.tip_amount + q.tolls_amount + q.tlenkm + q.traveltime +
## q.espeed, data = df[, c("target.total_amount", vars_cexp)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.203 -0.196 -0.053 0.070 94.855
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.103354 0.160998 13.064 < 2e-16 ***
## q.fare_amount 0.917656 0.012937 70.932 < 2e-16 ***
## q.extra 1.067019 0.104097 10.250 < 2e-16 ***
## q.tip_amount 1.047409 0.023077 45.387 < 2e-16 ***
## q.tolls_amount 1.025892 0.077574 13.225 < 2e-16 ***
## q.tlenkm 0.436186 0.026014 16.768 < 2e-16 ***
## q.traveltime -0.064484 0.008461 -7.621 3.04e-14 ***
## q.espeed -0.069090 0.007192 -9.606 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.581 on 4603 degrees of freedom
## Multiple R-squared: 0.9339, Adjusted R-squared: 0.9338
## F-statistic: 9295 on 7 and 4603 DF, p-value: < 2.2e-16
However, we see that it continues to explain much of the variability of our target (93.39%).
Therefore, we will try to make a model manually based on what model_1_bic has shown us and our knowledge of the data:
model_2 <- lm(
target.total_amount~
q.passenger_count +
q.fare_amount +
q.extra +
q.tip_amount +
q.tolls_amount +
q.hour +
q.tlenkm +
q.traveltime +
q.espeed
,
data=df[,c("target.total_amount",vars_cexp)]
)
summary(model_2)
##
## Call:
## lm(formula = target.total_amount ~ q.passenger_count + q.fare_amount +
## q.extra + q.tip_amount + q.tolls_amount + q.hour + q.tlenkm +
## q.traveltime + q.espeed, data = df[, c("target.total_amount",
## vars_cexp)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.205 -0.197 -0.052 0.071 94.859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1016961 0.1862386 11.285 < 2e-16 ***
## q.passenger_count 0.0074884 0.0367525 0.204 0.839
## q.fare_amount 0.9176846 0.0129422 70.907 < 2e-16 ***
## q.extra 1.0684221 0.1072657 9.961 < 2e-16 ***
## q.tip_amount 1.0475525 0.0230918 45.365 < 2e-16 ***
## q.tolls_amount 1.0257256 0.0775996 13.218 < 2e-16 ***
## q.hour -0.0005778 0.0058073 -0.100 0.921
## q.tlenkm 0.4361459 0.0260205 16.762 < 2e-16 ***
## q.traveltime -0.0645068 0.0084674 -7.618 3.1e-14 ***
## q.espeed -0.0691571 0.0072157 -9.584 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.582 on 4601 degrees of freedom
## Multiple R-squared: 0.9339, Adjusted R-squared: 0.9338
## F-statistic: 7226 on 9 and 4601 DF, p-value: < 2.2e-16
We see that the explainability is now 93.39%.
vif(model_2) # Check association between explanatory vars
## q.passenger_count q.fare_amount q.extra q.tip_amount
## 1.004128 7.901266 1.070527 1.242636
## q.tolls_amount q.hour q.tlenkm q.traveltime
## 1.066168 1.072503 9.378271 4.989265
## q.espeed
## 2.734212
Even so, owning one is still beyond the reach of the average person.
We try to make a new model without the distance:
model_3 <- lm(
target.total_amount~
q.passenger_count +
q.fare_amount +
q.extra +
q.tip_amount +
q.tolls_amount +
q.hour +
q.traveltime +
q.espeed
,
data=df[,c("target.total_amount",vars_cexp)]
)
summary(model_3)
##
## Call:
## lm(formula = target.total_amount ~ q.passenger_count + q.fare_amount +
## q.extra + q.tip_amount + q.tolls_amount + q.hour + q.traveltime +
## q.espeed, data = df[, c("target.total_amount", vars_cexp)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.322 -0.251 0.000 0.117 95.540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2903616 0.1562258 1.859 0.0631 .
## q.passenger_count 0.0132996 0.0378522 0.351 0.7253
## q.fare_amount 1.0440693 0.0108341 96.369 <2e-16 ***
## q.extra 1.1208455 0.1104332 10.150 <2e-16 ***
## q.tip_amount 1.0607708 0.0237700 44.627 <2e-16 ***
## q.tolls_amount 1.0842604 0.0798441 13.580 <2e-16 ***
## q.hour -0.0001983 0.0059813 -0.033 0.9736
## q.traveltime -0.0089434 0.0080250 -1.114 0.2651
## q.espeed 0.0052878 0.0058573 0.903 0.3667
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.659 on 4602 degrees of freedom
## Multiple R-squared: 0.9299, Adjusted R-squared: 0.9298
## F-statistic: 7630 on 8 and 4602 DF, p-value: < 2.2e-16
We see that the explainability is now 92.99%.
vif(model_3) # Check association between explanatory vars
## q.passenger_count q.fare_amount q.extra q.tip_amount
## 1.004039 5.219389 1.069616 1.241186
## q.tolls_amount q.hour q.traveltime q.espeed
## 1.064009 1.072486 4.224578 1.698328
The live ones are fine now. Still, we’ve pulled the distance, which conceptually we can’t afford. Therefore, we will try to remove another variable with a high vif (q.fare_amount), instead of q.tlenkm:
model_4 <- lm(
target.total_amount~
q.passenger_count +
q.extra +
q.tip_amount +
q.tolls_amount +
q.hour +
q.tlenkm +
q.traveltime +
q.espeed
,
data=df[,c("target.total_amount",vars_cexp)]
)
summary(model_4)
##
## Call:
## lm(formula = target.total_amount ~ q.passenger_count + q.extra +
## q.tip_amount + q.tolls_amount + q.hour + q.tlenkm + q.traveltime +
## q.espeed, data = df[, c("target.total_amount", vars_cexp)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.146 -0.613 -0.248 0.192 94.727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.548119 0.264727 17.180 < 2e-16 ***
## q.passenger_count 0.004933 0.053162 0.093 0.92607
## q.extra 0.552686 0.154800 3.570 0.00036 ***
## q.tip_amount 1.227130 0.033200 36.961 < 2e-16 ***
## q.tolls_amount 1.308155 0.112098 11.670 < 2e-16 ***
## q.hour 0.007250 0.008399 0.863 0.38806
## q.tlenkm 1.511058 0.030591 49.396 < 2e-16 ***
## q.traveltime 0.182147 0.011167 16.312 < 2e-16 ***
## q.espeed -0.054416 0.010433 -5.216 1.91e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.734 on 4602 degrees of freedom
## Multiple R-squared: 0.8617, Adjusted R-squared: 0.8615
## F-statistic: 3585 on 8 and 4602 DF, p-value: < 2.2e-16
We see that the explainability is now 86.17%.
vif(model_4) # Check association between explanatory vars
## q.passenger_count q.extra q.tip_amount q.tolls_amount
## 1.004128 1.065604 1.227688 1.063359
## q.hour q.tlenkm q.traveltime q.espeed
## 1.072115 6.195063 4.147204 2.731942
Despite having high vifs, we still have high explicability of the variability of our target and, given that the variable we have taken out we can remove with time and distance from the trip, we do not need it.
So we continue to stay with this variable and make new models. We apply BIC to help us a little:
model_4_bic <- step( model_4, k=log(nrow(df)) )
## Start: AIC=12217.36
## target.total_amount ~ q.passenger_count + q.extra + q.tip_amount +
## q.tolls_amount + q.hour + q.tlenkm + q.traveltime + q.espeed
##
## Df Sum of Sq RSS AIC
## - q.passenger_count 1 0 64174 12209
## - q.hour 1 10 64184 12210
## <none> 64174 12217
## - q.extra 1 178 64351 12222
## - q.espeed 1 379 64553 12236
## - q.tolls_amount 1 1899 66073 12343
## - q.traveltime 1 3710 67884 12468
## - q.tip_amount 1 19051 83224 13408
## - q.tlenkm 1 34025 98198 14170
##
## Step: AIC=12208.94
## target.total_amount ~ q.extra + q.tip_amount + q.tolls_amount +
## q.hour + q.tlenkm + q.traveltime + q.espeed
##
## Df Sum of Sq RSS AIC
## - q.hour 1 10 64184 12201
## <none> 64174 12209
## - q.extra 1 179 64352 12213
## - q.espeed 1 379 64553 12228
## - q.tolls_amount 1 1900 66073 12335
## - q.traveltime 1 3710 67884 12460
## - q.tip_amount 1 19056 83230 13399
## - q.tlenkm 1 34030 98204 14162
##
## Step: AIC=12201.24
## target.total_amount ~ q.extra + q.tip_amount + q.tolls_amount +
## q.tlenkm + q.traveltime + q.espeed
##
## Df Sum of Sq RSS AIC
## <none> 64184 12201
## - q.extra 1 211 64395 12208
## - q.espeed 1 391 64575 12221
## - q.tolls_amount 1 1902 66086 12328
## - q.traveltime 1 3703 67887 12451
## - q.tip_amount 1 19088 83272 13393
## - q.tlenkm 1 34063 98247 14156
Following BIC, we have to eliminate variables until the vif’s are less than 5. Therefore, the model that meets this is:
model_5 <- lm(
target.total_amount~
q.passenger_count +
q.extra +
q.tip_amount +
q.tolls_amount +
q.tlenkm +
q.traveltime
,
data=df
)
summary(model_5)
##
## Call:
## lm(formula = target.total_amount ~ q.passenger_count + q.extra +
## q.tip_amount + q.tolls_amount + q.tlenkm + q.traveltime,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.380 -0.644 -0.251 0.211 94.956
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.582803 0.125371 28.578 < 2e-16 ***
## q.passenger_count 0.001889 0.053304 0.035 0.972
## q.extra 0.605472 0.150868 4.013 6.08e-05 ***
## q.tip_amount 1.223749 0.033279 36.773 < 2e-16 ***
## q.tolls_amount 1.307289 0.112420 11.629 < 2e-16 ***
## q.tlenkm 1.385255 0.019221 72.070 < 2e-16 ***
## q.traveltime 0.221884 0.008248 26.901 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.745 on 4604 degrees of freedom
## Multiple R-squared: 0.8609, Adjusted R-squared: 0.8607
## F-statistic: 4748 on 6 and 4604 DF, p-value: < 2.2e-16
We see that the explainability is now 86.09%
vif(model_5) # Check association between explanatory vars
## q.passenger_count q.extra q.tip_amount q.tolls_amount
## 1.003687 1.006299 1.226347 1.063286
## q.tlenkm q.traveltime
## 2.431645 2.249571
There is no vif that exceeds 5.
Let’s now discriminate the variables independently:
marginalModelPlots(model_5)
We see that there is not much mismatch of the marginal variables. If there were any, we would have to transform our explanatory variables.
par(mfrow=c(2,2))
plot(model_5, id.n=0 )
par(mfrow=c(1,1))
Looking at the results, we can say that:
All this is due to the fact that our target variable was no longer normally distributed. To solve this, we apply the logarithm:
model_6 <- lm(
log(target.total_amount)~
q.passenger_count +
q.extra +
q.tip_amount +
q.tolls_amount +
q.tlenkm +
q.traveltime
,
data=df
)
summary(model_6)
##
## Call:
## lm(formula = log(target.total_amount) ~ q.passenger_count + q.extra +
## q.tip_amount + q.tolls_amount + q.tlenkm + q.traveltime,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.49383 -0.10927 0.03793 0.14491 2.68692
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.8572872 0.0084592 219.558 < 2e-16 ***
## q.passenger_count -0.0014091 0.0035967 -0.392 0.695
## q.extra 0.0704555 0.0101797 6.921 5.09e-12 ***
## q.tip_amount 0.0624228 0.0022454 27.800 < 2e-16 ***
## q.tolls_amount 0.0308942 0.0075854 4.073 4.72e-05 ***
## q.tlenkm 0.0550138 0.0012969 42.419 < 2e-16 ***
## q.traveltime 0.0220808 0.0005565 39.676 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2527 on 4604 degrees of freedom
## Multiple R-squared: 0.7951, Adjusted R-squared: 0.7948
## F-statistic: 2978 on 6 and 4604 DF, p-value: < 2.2e-16
We see that when doing the logarithm, the coefficient of determination is getting lower and lower, now it is 79.51%. We have seen that it has gotten worse than the previous model. Therefore, we discard it. We will work with model_5.
However, let’s remember the last three models we used:
According to the coefficient of explicability, the ranking is: model_4 >> model_5 >> model_6. As for the VIFs, however, the ranking is: model_6 >> model_5 >> model_4. Since VIFs are acceptable on both model_5 and model_6, and not acceptable on model_4, the smartest option is to choose model_5.
So, let’s look at the effects of this model:
Anova(model_5)
## Anova Table (Type II tests)
##
## Response: target.total_amount
## Sum Sq Df F value Pr(>F)
## q.passenger_count 0 1 0.0013 0.9717
## q.extra 226 1 16.1062 6.084e-05 ***
## q.tip_amount 18966 1 1352.2380 < 2.2e-16 ***
## q.tolls_amount 1897 1 135.2241 < 2.2e-16 ***
## q.tlenkm 72851 1 5194.0555 < 2.2e-16 ***
## q.traveltime 10150 1 723.6844 < 2.2e-16 ***
## Residuals 64575 4604
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We see that now the net effects are significant.
library(effects)
plot(allEffects(model_5))
We see that our model defines the following:
par(mfrow=c(2,2))
plot(model_5, id.n=0 )
par(mfrow=c(1,1))
We see that the residues are not completely optimal.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
boxcox(
target.total_amount~
q.passenger_count +
q.extra +
q.tip_amount +
q.tolls_amount +
q.tlenkm +
q.traveltime
,
data=df
)
We see the lambda parameter estimation method in the boxcox method. This gives us an idea of the power to which we need to raise the target variable in order to improve the properties of the linear model.
It is worth trying a new model with a square root in the target variable:
model_7 <- lm(
sqrt(target.total_amount)~
q.passenger_count +
q.extra +
q.tip_amount +
q.tolls_amount +
q.tlenkm +
q.traveltime
,
data=df
)
summary(model_7)
##
## Call:
## lm(formula = sqrt(target.total_amount) ~ q.passenger_count +
## q.extra + q.tip_amount + q.tolls_amount + q.tlenkm + q.traveltime,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7437 -0.1380 0.0139 0.1508 7.4872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3699317 0.0136357 173.804 < 2e-16 ***
## q.passenger_count -0.0013314 0.0057976 -0.230 0.818
## q.extra 0.0977427 0.0164089 5.957 2.77e-09 ***
## q.tip_amount 0.1318869 0.0036195 36.438 < 2e-16 ***
## q.tolls_amount 0.1030452 0.0122272 8.428 < 2e-16 ***
## q.tlenkm 0.1322517 0.0020905 63.262 < 2e-16 ***
## q.traveltime 0.0357927 0.0008971 39.899 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4073 on 4604 degrees of freedom
## Multiple R-squared: 0.8641, Adjusted R-squared: 0.8639
## F-statistic: 4879 on 6 and 4604 DF, p-value: < 2.2e-16
We see that the coefficient has improved, from 85.09% (model_5) to 86.41% (model_7). But … is it worth it from a residual point of view?
par(mfrow=c(2,2))
plot( model_7, id.n=0 )
par(mfrow=c(1,1))
We see we haven’t won too much. So we stick to model_5.
model_8<-lm(log(target.total_amount)~ q.extra + q.tip_amount + q.tolls_amount + f.improvement_surcharge + q.espeed + log(q.tlenkm), data=df)
summary(model_8)
##
## Call:
## lm(formula = log(target.total_amount) ~ q.extra + q.tip_amount +
## q.tolls_amount + f.improvement_surcharge + q.espeed + log(q.tlenkm),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.14903 -0.06792 -0.01991 0.05069 2.77861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0982020 0.0205582 102.061 < 2e-16 ***
## q.extra 0.0884882 0.0079393 11.146 < 2e-16 ***
## q.tip_amount 0.0655898 0.0017109 38.337 < 2e-16 ***
## q.tolls_amount 0.0428318 0.0058348 7.341 2.5e-13 ***
## f.improvement_surchargeYes -0.2523217 0.0194490 -12.974 < 2e-16 ***
## q.espeed -0.0091816 0.0003899 -23.550 < 2e-16 ***
## log(q.tlenkm) 0.6191131 0.0044464 139.239 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1953 on 4604 degrees of freedom
## Multiple R-squared: 0.8777, Adjusted R-squared: 0.8775
## F-statistic: 5505 on 6 and 4604 DF, p-value: < 2.2e-16
We see that the explainability is now 87.77%. The more influent effects in this models are the length in km of the trip and the tip amount given.
Anova(model_8)
## Anova Table (Type II tests)
##
## Response: log(target.total_amount)
## Sum Sq Df F value Pr(>F)
## q.extra 4.74 1 124.225 < 2.2e-16 ***
## q.tip_amount 56.03 1 1469.717 < 2.2e-16 ***
## q.tolls_amount 2.05 1 53.886 2.499e-13 ***
## f.improvement_surcharge 6.42 1 168.312 < 2.2e-16 ***
## q.espeed 21.14 1 554.595 < 2.2e-16 ***
## log(q.tlenkm) 739.16 1 19387.533 < 2.2e-16 ***
## Residuals 175.53 4604
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(model_8)
## q.extra q.tip_amount q.tolls_amount
## 1.025199 1.192442 1.053741
## f.improvement_surcharge q.espeed log(q.tlenkm)
## 1.027504 1.395417 1.545375
residualPlots(model_8)
## Test stat Pr(>|Test stat|)
## q.extra 5.5432 3.135e-08 ***
## q.tip_amount -4.5251 6.189e-06 ***
## q.tolls_amount 0.0307 0.9755
## f.improvement_surcharge
## q.espeed 13.5154 < 2.2e-16 ***
## log(q.tlenkm) 13.8598 < 2.2e-16 ***
## Tukey test -0.6750 0.4997
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# vars_enum<-c("q.extra","q.tip_amount","q.tolls_amount","f.improvement_surcharge","tlenkm")
# vars_edis<-c("VendorID","RateCodeID","Payment_type","period")
#
df$f.extra <- factor(df$q.extra)
model_9<-lm(
log(target.total_amount)~
f.extra +
q.tip_amount +
q.tolls_amount +
f.improvement_surcharge +
q.espeed +
log(q.tlenkm)
,data=df
)
BIC(model_8,model_9)
## df BIC
## model_8 8 -1917.617
## model_9 9 -1939.860
We can see from the BIC that the model_9 is better than the model_8, so it is correct to consider extra as factor. Next, we will do the same with the tolls_amount and use the factor we had already created (paid_tolls).
model_10<-lm(
log(target.total_amount)~
f.extra +
q.tip_amount +
f.paid_tolls +
f.improvement_surcharge +
q.espeed +
log(q.tlenkm)
,data=df
)
BIC(model_8,model_9,model_10)
## df BIC
## model_8 8 -1917.617
## model_9 9 -1939.860
## model_10 9 -1944.606
We see can see that it is correct to use the paid_tolls factor to improve our model. We will try it now with the effective speed.
model_11<-lm(
log(target.total_amount)~
f.extra +
q.tip_amount +
f.paid_tolls +
f.improvement_surcharge +
f.espeed +
log(q.tlenkm)
,data=df
)
BIC(model_8,model_9,model_10,model_11)
## df BIC
## model_8 8 -1917.617
## model_9 9 -1939.860
## model_10 9 -1944.606
## model_11 13 -1963.320
We can see that the best approach is the model_10, so we are going to stick to it for now.
model_12 <- model_10
Anova(model_12)
## Anova Table (Type II tests)
##
## Response: log(target.total_amount)
## Sum Sq Df F value Pr(>F)
## f.extra 5.89 2 77.880 < 2.2e-16 ***
## q.tip_amount 55.28 1 1460.732 < 2.2e-16 ***
## f.paid_tolls 2.12 1 55.915 9.007e-14 ***
## f.improvement_surcharge 5.88 1 155.314 < 2.2e-16 ***
## q.espeed 18.07 1 477.567 < 2.2e-16 ***
## log(q.tlenkm) 730.06 1 19292.288 < 2.2e-16 ***
## Residuals 174.19 4603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model_12)
##
## Call:
## lm(formula = log(target.total_amount) ~ f.extra + q.tip_amount +
## f.paid_tolls + f.improvement_surcharge + q.espeed + log(q.tlenkm),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.13181 -0.06786 -0.01713 0.04833 2.75572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0895877 0.0205470 101.698 < 2e-16 ***
## f.extra0.5 0.0158044 0.0064600 2.446 0.0145 *
## f.extra1 0.1027775 0.0083225 12.349 < 2e-16 ***
## q.tip_amount 0.0653075 0.0017087 38.220 < 2e-16 ***
## f.paid_tollsYes 0.2296901 0.0307168 7.478 9.01e-14 ***
## f.improvement_surchargeYes -0.2424837 0.0194571 -12.462 < 2e-16 ***
## q.espeed -0.0087026 0.0003982 -21.853 < 2e-16 ***
## log(q.tlenkm) 0.6171457 0.0044432 138.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1945 on 4603 degrees of freedom
## Multiple R-squared: 0.8786, Adjusted R-squared: 0.8784
## F-statistic: 4759 on 7 and 4603 DF, p-value: < 2.2e-16
We can see from the Anova test that f.extra has 2 freedom degrees and globally it does have a significant net effect once the other variables are in the model.
We are going to take a look at the residues.
par(mfrow=c(2,2))
plot( model_12, id.n=0 )
par(mfrow=c(1,1))
Looking at the results, we can say that:
We proceed to take a look at the influence plot to check our influent residuals for model_12.
influencePlot( model_12, id=c(list="noteworthy",n=5))
## StudRes Hat CookD
## 194151 -1.4165961 0.027830775 0.0071794393
## 211894 11.8720921 0.010543346 0.1821960413
## 604912 -2.2551896 0.034191469 0.0224862744
## 636795 10.7789754 0.010322738 0.1477856180
## 638666 -2.3209123 0.035346786 0.0246486143
## 646551 12.3519800 0.001824547 0.0337490049
## 710390 -1.2640287 0.027396540 0.0056250635
## 761529 10.6109774 0.010077102 0.1398787908
## 896291 0.4256356 0.027534736 0.0006413128
## 952547 12.6484050 0.002931994 0.0568424885
## 1208612 14.4971050 0.001762369 0.0443645191
## 1237379 14.0568677 0.009901130 0.2368806919
## 1345546 -11.1335622 0.012642289 0.1932327624
We see this model as a disaster. That is, we have a student waste of the order of 35. We can confirm that this is too much. We have to compare student waste with a normal standard. Therefore, we would say that the model we have so far is a model that has a serious waste problem.
Intento treure outliers multivariants de total_amount i tlenkm:
library(mvoutlier)
library(chemometrics)
multivariant_outliers <- Moutlier(df[, c(15,20)], quantile = 0.995)
multivariant_outliers$cutoff
## [1] 3.255247
par(mfrow=c(1,1))
plot(multivariant_outliers$md, multivariant_outliers$rd, type="n")
text(multivariant_outliers$md, multivariant_outliers$rd, labels=rownames(df[, c(15,20)]), cex=0.5)
ll_mvoutliers<-c('1237379', '1208612', '1171898', '488540', '211894', '638666', '329000', '1175981', '604912')
df <- df[!(row.names(df) %in% ll_mvoutliers),]
multivariant_outliers <- Moutlier(df[, c(15,20)], quantile = 0.995)
multivariant_outliers$cutoff
## [1] 3.255247
par(mfrow=c(1,1))
plot(multivariant_outliers$md, multivariant_outliers$rd, type="n")
text(multivariant_outliers$md, multivariant_outliers$rd, labels=rownames(df[, c(15,20)]), cex=0.75)
In order for this not to happen to us, we need to work on the variable q.tlenkm.
So let’s create a new model that does not give so many problems:
model_13<-lm(
log(target.total_amount)~
f.extra +
q.tip_amount +
f.paid_tolls +
f.improvement_surcharge +
q.espeed +
log(q.tlenkm)
,data=df
)
summary(model_13)
##
## Call:
## lm(formula = log(target.total_amount) ~ f.extra + q.tip_amount +
## f.paid_tolls + f.improvement_surcharge + q.espeed + log(q.tlenkm),
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10502 -0.06679 -0.01703 0.04902 2.42599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0557085 0.0190514 107.903 < 2e-16 ***
## f.extra0.5 0.0175034 0.0059203 2.957 0.00313 **
## f.extra1 0.0999597 0.0076298 13.101 < 2e-16 ***
## q.tip_amount 0.0654379 0.0015946 41.038 < 2e-16 ***
## f.paid_tollsYes 0.2460097 0.0286456 8.588 < 2e-16 ***
## f.improvement_surchargeYes -0.2110400 0.0180607 -11.685 < 2e-16 ***
## q.espeed -0.0089655 0.0003656 -24.521 < 2e-16 ***
## log(q.tlenkm) 0.6234997 0.0040831 152.702 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1782 on 4594 degrees of freedom
## Multiple R-squared: 0.8959, Adjusted R-squared: 0.8957
## F-statistic: 5648 on 7 and 4594 DF, p-value: < 2.2e-16
vif(model_13)
## GVIF Df GVIF^(1/(2*Df))
## f.extra 1.084371 2 1.020456
## q.tip_amount 1.182362 1 1.087365
## f.paid_tolls 1.050503 1 1.024941
## f.improvement_surcharge 1.034810 1 1.017256
## q.espeed 1.457073 1 1.207093
## log(q.tlenkm) 1.544211 1 1.242663
influencePlot( model_13, id=c(list="noteworthy",n=5))
## StudRes Hat CookD
## 121215 0.4203763 0.0283169950 0.0006438531
## 194151 -1.6787049 0.0297354741 0.0107912464
## 360250 -2.4687439 0.0287198240 0.0225018374
## 636795 12.0427760 0.0106079001 0.1884592859
## 646551 13.5889482 0.0018308364 0.0407101663
## 710390 -1.5203565 0.0292595595 0.0087064599
## 761529 11.8520824 0.0103596683 0.1783934856
## 856112 12.6825286 0.0009709111 0.0188829083
## 892761 7.3439088 0.0157633181 0.1067424678
## 896291 0.3892795 0.0283833874 0.0005534554
## 952547 13.9202623 0.0029422996 0.0685992790
## 1204489 11.2790483 0.0104108324 0.1628225229
## 1345546 -12.0786387 0.0129275449 0.2315405486
After doing certain tests, taking into account the influences, the coefficients of explicability and the vifs, we decided that the best we can get is a model where q.tlenkm does not apply any operation.
So let’s analyze it:
residualPlots(model_13)
## Test stat Pr(>|Test stat|)
## f.extra
## q.tip_amount -4.3322 1.508e-05 ***
## f.paid_tolls
## f.improvement_surcharge
## q.espeed 14.0221 < 2.2e-16 ***
## log(q.tlenkm) 15.5948 < 2.2e-16 ***
## Tukey test 1.0019 0.3164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In the residualPlots, what we find is, for each factor, a boxplot of its categories and, for each quantitative variable, a pearson graph.
Let’s use another tool to fully understand our model:
marginalModelPlots(model_13)
## Warning in mmps(...): Interactions and/or factors skipped
In relation to the variable q.tip_amount, we see that there is a bit of mismatch, but not much, since tips given in cash are always declared as 0. Therefore, the data are not entirely real.
As for the variable q.tlenkm, we see that some observations do not follow the required pattern, and we have to modify them in some way.
How do we do that?
ll1<-Boxplot(rstudent(model_13));ll1
## [1] 4269 80 2621 4051 1385 3035 3802 2666 3211 2299 3021 2032 2711 2005 2434
## [16] 1978 3838 4 3808 4243
ll1<-c(4269, 80, 2621)
df[ll1,]
## f.vendor_id f.code_rate_id q.pickup_longitude q.pickup_latitude
## 1345546 f.Vendor-VeriFone Rate-Other -73.92619 40.76569
## 24990 f.Vendor-Mobile Rate-1 -73.95438 40.80410
## 825427 f.Vendor-Mobile Rate-1 -73.93534 40.63492
## q.dropoff_longitude q.dropoff_latitude q.passenger_count
## 1345546 -73.93353 40.76379 1
## 24990 -73.95515 40.80468 1
## 825427 -73.93534 40.63492 1
## q.trip_distance q.fare_amount q.extra f.mta_tax q.tip_amount
## 1345546 10.42 5.0 0.0 No 0
## 24990 5.60 2.5 0.5 Yes 0
## 825427 5.50 2.5 0.5 Yes 0
## q.tolls_amount f.improvement_surcharge target.total_amount
## 1345546 0 No 5.0
## 24990 0 Yes 3.8
## 825427 0 Yes 3.8
## f.payment_type f.trip_type q.hour f.period q.tlenkm q.traveltime
## 1345546 Cash Dispatch 9 Period morning 16.769364 60.0000000
## 24990 No paid Street-Hail 3 Period night 9.012326 0.5333333
## 825427 No paid Street-Hail 0 Period night 8.851392 0.2666667
## q.espeed qual.pickup qual.dropoff f.trip_distance_range
## 1345546 11.06889 09 11 Long_dist
## 24990 23.16672 03 03 Long_dist
## 825427 23.05353 00 00 Long_dist
## target.tip_is_given f.passenger_groups f.paid_tolls f.cost f.tt
## 1345546 No Single No [0,8] (20,60]
## 24990 No Single No [0,8] [0,5]
## 825427 No Single No [0,8] [0,5]
## f.dist f.hour f.espeed f.extra
## 1345546 (5.5, 30] other [10,20) 0
## 24990 (5.5, 30] other [20,30) 0.5
## 825427 (3, 5.5] other [20,30) 0.5
Let’s see the strangest:
Veiem que són observacionsa vastant normals. Tot i això, per exemple, podem destacar que la observació 4269, a la qual ja se li aplica una tarifa de 5$, per molts km és que hagi fet, el preu no ha pujat.
We do the same for the cook distance:
ll4 <- Boxplot(cooks.distance(model_13));ll4
## [1] 4269 2005 2434 3838 2837 4 4524 3808 4051 3021
ll4<-c(4269, 2005, 2434)
df[ll4,]
## f.vendor_id f.code_rate_id q.pickup_longitude q.pickup_latitude
## 1345546 f.Vendor-VeriFone Rate-Other -73.92619 40.76569
## 636795 f.Vendor-VeriFone Rate-Other -73.96568 40.68322
## 761529 f.Vendor-VeriFone Rate-Other -73.94013 40.71141
## q.dropoff_longitude q.dropoff_latitude q.passenger_count
## 1345546 -73.93353 40.76379 1
## 636795 -73.96699 40.68422 1
## 761529 -73.93863 40.71203 4
## q.trip_distance q.fare_amount q.extra f.mta_tax q.tip_amount
## 1345546 10.42000 5.00 0 No 0
## 636795 6.39489 50.00 0 No 0
## 761529 0.05000 49.99 0 No 0
## q.tolls_amount f.improvement_surcharge target.total_amount
## 1345546 0 No 5.00
## 636795 0 No 50.00
## 761529 0 No 49.99
## f.payment_type f.trip_type q.hour f.period q.tlenkm q.traveltime
## 1345546 Cash Dispatch 9 Period morning 16.76936 60.00000000
## 636795 Cash Dispatch 16 Period valley 1.00000 1.26666667
## 761529 Credit card Dispatch 21 Period night 1.00000 0.03333333
## q.espeed qual.pickup qual.dropoff f.trip_distance_range
## 1345546 11.06889 09 11 Long_dist
## 636795 27.33968 16 16 Short_dist
## 761529 23.79045 21 21 Short_dist
## target.tip_is_given f.passenger_groups f.paid_tolls f.cost f.tt
## 1345546 No Single No [0,8] (20,60]
## 636795 No Single No (30,50] [0,5]
## 761529 No Group No (30,50] [0,5]
## f.dist f.hour f.espeed f.extra
## 1345546 (5.5, 30] other [10,20) 0
## 636795 (5.5, 30] other [20,30) 0
## 761529 (0, 1.6] 21 [20,30) 0
We see that, apart from the first, explained above, the other two observations have a trip length of 1km, but instead has been paid about $ 50. We see that this is not possible.
It is necessary to eliminate these observations that do not have the same tendency as our model:
dfred<-df[-ll4,]
model_14<-lm(
log(target.total_amount)~
f.extra +
q.tip_amount +
f.paid_tolls +
f.improvement_surcharge +
q.espeed +
log(q.tlenkm)
,data=dfred
)
summary(model_14)
##
## Call:
## lm(formula = log(target.total_amount) ~ f.extra + q.tip_amount +
## f.paid_tolls + f.improvement_surcharge + q.espeed + log(q.tlenkm),
## data = dfred)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.69585 -0.06668 -0.01671 0.04908 2.43663
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0373806 0.0184125 110.652 < 2e-16 ***
## f.extra0.5 0.0184093 0.0056474 3.260 0.00112 **
## f.extra1 0.0997061 0.0072780 13.700 < 2e-16 ***
## q.tip_amount 0.0650028 0.0015213 42.730 < 2e-16 ***
## f.paid_tollsYes 0.2453415 0.0273246 8.979 < 2e-16 ***
## f.improvement_surchargeYes -0.1914635 0.0174708 -10.959 < 2e-16 ***
## q.espeed -0.0093036 0.0003492 -26.642 < 2e-16 ***
## log(q.tlenkm) 0.6286084 0.0039030 161.059 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1699 on 4591 degrees of freedom
## Multiple R-squared: 0.905, Adjusted R-squared: 0.9049
## F-statistic: 6248 on 7 and 4591 DF, p-value: < 2.2e-16
Anova(model_14)
## Anova Table (Type II tests)
##
## Response: log(target.total_amount)
## Sum Sq Df F value Pr(>F)
## f.extra 5.48 2 94.850 < 2.2e-16 ***
## q.tip_amount 52.73 1 1825.836 < 2.2e-16 ***
## f.paid_tolls 2.33 1 80.619 < 2.2e-16 ***
## f.improvement_surcharge 3.47 1 120.101 < 2.2e-16 ***
## q.espeed 20.50 1 709.789 < 2.2e-16 ***
## log(q.tlenkm) 749.16 1 25940.109 < 2.2e-16 ***
## Residuals 132.59 4591
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vif(model_14)
## GVIF Df GVIF^(1/(2*Df))
## f.extra 1.083640 2 1.020284
## q.tip_amount 1.182486 1 1.087422
## f.paid_tolls 1.050503 1 1.024941
## f.improvement_surcharge 1.033891 1 1.016804
## q.espeed 1.460196 1 1.208386
## log(q.tlenkm) 1.547842 1 1.244123
We see that the coefficient of determination has increased a bit and it seems that we have no collinearity problems.
names(df)
## [1] "f.vendor_id" "f.code_rate_id"
## [3] "q.pickup_longitude" "q.pickup_latitude"
## [5] "q.dropoff_longitude" "q.dropoff_latitude"
## [7] "q.passenger_count" "q.trip_distance"
## [9] "q.fare_amount" "q.extra"
## [11] "f.mta_tax" "q.tip_amount"
## [13] "q.tolls_amount" "f.improvement_surcharge"
## [15] "target.total_amount" "f.payment_type"
## [17] "f.trip_type" "q.hour"
## [19] "f.period" "q.tlenkm"
## [21] "q.traveltime" "q.espeed"
## [23] "qual.pickup" "qual.dropoff"
## [25] "f.trip_distance_range" "target.tip_is_given"
## [27] "f.passenger_groups" "f.paid_tolls"
## [29] "f.cost" "f.tt"
## [31] "f.dist" "f.hour"
## [33] "f.espeed" "f.extra"
model_15<-lm(
log(target.total_amount) ~
q.tip_amount +
log(q.tlenkm)+
f.paid_tolls+
f.improvement_surcharge +
f.espeed +
f.extra +
f.code_rate_id +
f.vendor_id +
f.payment_type+
f.period
,data=df
)
summary(model_15)
##
## Call:
## lm(formula = log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) +
## f.paid_tolls + f.improvement_surcharge + f.espeed + f.extra +
## f.code_rate_id + f.vendor_id + f.payment_type + f.period,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.07100 -0.06106 -0.01212 0.05413 2.33447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4250185 0.0428280 33.273 < 2e-16 ***
## q.tip_amount 0.0517313 0.0019163 26.995 < 2e-16 ***
## log(q.tlenkm) 0.6209633 0.0038383 161.782 < 2e-16 ***
## f.paid_tollsYes 0.1448719 0.0273812 5.291 1.27e-07 ***
## f.improvement_surchargeYes 0.5178918 0.0402982 12.852 < 2e-16 ***
## f.espeed[10,20) -0.1944393 0.0114657 -16.958 < 2e-16 ***
## f.espeed[20,30) -0.2883868 0.0122033 -23.632 < 2e-16 ***
## f.espeed[30,40) -0.3398952 0.0149364 -22.756 < 2e-16 ***
## f.espeed[40,50) -0.3606616 0.0189198 -19.063 < 2e-16 ***
## f.espeed[50,55] -0.4385135 0.0261803 -16.750 < 2e-16 ***
## f.extra0.5 0.0259337 0.0090278 2.873 0.00409 **
## f.extra1 0.1020348 0.0085383 11.950 < 2e-16 ***
## f.code_rate_idRate-Other 0.7687656 0.0387554 19.836 < 2e-16 ***
## f.vendor_idf.Vendor-VeriFone -0.0026786 0.0061663 -0.434 0.66402
## f.payment_typeCash -0.0680012 0.0064312 -10.574 < 2e-16 ***
## f.payment_typeNo paid -0.2428288 0.0320752 -7.571 4.46e-14 ***
## f.periodPeriod morning 0.0009375 0.0113906 0.082 0.93441
## f.periodPeriod valley 0.0069741 0.0097913 0.712 0.47634
## f.periodPeriod afternoon 0.0029100 0.0085276 0.341 0.73293
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1677 on 4583 degrees of freedom
## Multiple R-squared: 0.908, Adjusted R-squared: 0.9076
## F-statistic: 2513 on 18 and 4583 DF, p-value: < 2.2e-16
Anova(model_15)
## Anova Table (Type II tests)
##
## Response: log(target.total_amount)
## Sum Sq Df F value Pr(>F)
## q.tip_amount 20.49 1 728.7238 < 2.2e-16 ***
## log(q.tlenkm) 735.91 1 26173.4058 < 2.2e-16 ***
## f.paid_tolls 0.79 1 27.9939 1.274e-07 ***
## f.improvement_surcharge 4.64 1 165.1611 < 2.2e-16 ***
## f.espeed 22.49 5 159.9773 < 2.2e-16 ***
## f.extra 4.08 2 72.5752 < 2.2e-16 ***
## f.code_rate_id 11.06 1 393.4798 < 2.2e-16 ***
## f.vendor_id 0.01 1 0.1887 0.6640
## f.payment_type 4.19 2 74.5335 < 2.2e-16 ***
## f.period 0.02 3 0.2629 0.8522
## Residuals 128.86 4583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Veiem que, de totes les variables explicatives introduides noves, les que podem salvar són:
Creem un nou model amb aquestes:
model_16<-lm(
log(target.total_amount) ~
q.tip_amount +
log(q.tlenkm)+
f.paid_tolls+
f.espeed +
f.extra +
f.code_rate_id +
f.payment_type+
f.period
,data=df
)
anova(model_15, model_16)
## Analysis of Variance Table
##
## Model 1: log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.improvement_surcharge + f.espeed + f.extra + f.code_rate_id +
## f.vendor_id + f.payment_type + f.period
## Model 2: log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + f.period
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4583 128.86
## 2 4585 133.50 -2 -4.6445 82.594 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Veiem que no ens hem perdut res.
model_17<-lm(
log(target.total_amount) ~
(q.tip_amount + log(q.tlenkm))*(f.paid_tolls + f.espeed + f.extra + f.code_rate_id + f.payment_type + f.period)
,data=df
)
model_17<-step( model_17, k=log(nrow(df)))
## Start: AIC=-17256.64
## log(target.total_amount) ~ (q.tip_amount + log(q.tlenkm)) * (f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + f.period)
##
##
## Step: AIC=-17256.64
## log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + f.period +
## q.tip_amount:f.paid_tolls + q.tip_amount:f.espeed + q.tip_amount:f.extra +
## q.tip_amount:f.code_rate_id + q.tip_amount:f.period + log(q.tlenkm):f.paid_tolls +
## log(q.tlenkm):f.espeed + log(q.tlenkm):f.extra + log(q.tlenkm):f.code_rate_id +
## log(q.tlenkm):f.payment_type + log(q.tlenkm):f.period
##
## Df Sum of Sq RSS AIC
## - log(q.tlenkm):f.period 3 0.0047 100.05 -17282
## - q.tip_amount:f.period 3 0.0259 100.07 -17281
## - q.tip_amount:f.extra 2 0.0639 100.11 -17271
## - log(q.tlenkm):f.paid_tolls 1 0.0581 100.10 -17262
## <none> 100.05 -17257
## - q.tip_amount:f.paid_tolls 1 0.2062 100.25 -17256
## - log(q.tlenkm):f.extra 2 0.9401 100.99 -17230
## - log(q.tlenkm):f.espeed 5 1.7854 101.83 -17217
## - q.tip_amount:f.espeed 5 1.7942 101.84 -17217
## - log(q.tlenkm):f.payment_type 2 2.7241 102.77 -17150
## - q.tip_amount:f.code_rate_id 1 3.2467 103.29 -17118
## - log(q.tlenkm):f.code_rate_id 1 24.4450 124.49 -16259
##
## Step: AIC=-17281.72
## log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + f.period +
## q.tip_amount:f.paid_tolls + q.tip_amount:f.espeed + q.tip_amount:f.extra +
## q.tip_amount:f.code_rate_id + q.tip_amount:f.period + log(q.tlenkm):f.paid_tolls +
## log(q.tlenkm):f.espeed + log(q.tlenkm):f.extra + log(q.tlenkm):f.code_rate_id +
## log(q.tlenkm):f.payment_type
##
## Df Sum of Sq RSS AIC
## - q.tip_amount:f.period 3 0.0232 100.07 -17306
## - q.tip_amount:f.extra 2 0.0616 100.11 -17296
## - log(q.tlenkm):f.paid_tolls 1 0.0584 100.11 -17288
## <none> 100.05 -17282
## - q.tip_amount:f.paid_tolls 1 0.2076 100.26 -17281
## - log(q.tlenkm):f.espeed 5 1.7923 101.84 -17242
## - q.tip_amount:f.espeed 5 1.7956 101.85 -17242
## - log(q.tlenkm):f.extra 2 1.6509 101.70 -17223
## - log(q.tlenkm):f.payment_type 2 2.7324 102.78 -17175
## - q.tip_amount:f.code_rate_id 1 3.2471 103.30 -17143
## - log(q.tlenkm):f.code_rate_id 1 25.3794 125.43 -16250
##
## Step: AIC=-17305.96
## log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + f.period +
## q.tip_amount:f.paid_tolls + q.tip_amount:f.espeed + q.tip_amount:f.extra +
## q.tip_amount:f.code_rate_id + log(q.tlenkm):f.paid_tolls +
## log(q.tlenkm):f.espeed + log(q.tlenkm):f.extra + log(q.tlenkm):f.code_rate_id +
## log(q.tlenkm):f.payment_type
##
## Df Sum of Sq RSS AIC
## - f.period 3 0.1722 100.25 -17323
## - q.tip_amount:f.extra 2 0.1242 100.20 -17317
## - log(q.tlenkm):f.paid_tolls 1 0.0590 100.13 -17312
## <none> 100.07 -17306
## - q.tip_amount:f.paid_tolls 1 0.2092 100.28 -17305
## - log(q.tlenkm):f.espeed 5 1.7873 101.86 -17267
## - q.tip_amount:f.espeed 5 1.8682 101.94 -17263
## - log(q.tlenkm):f.extra 2 1.6516 101.72 -17248
## - log(q.tlenkm):f.payment_type 2 2.7497 102.82 -17198
## - q.tip_amount:f.code_rate_id 1 3.2953 103.37 -17165
## - log(q.tlenkm):f.code_rate_id 1 25.3969 125.47 -16274
##
## Step: AIC=-17323.35
## log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + q.tip_amount:f.paid_tolls +
## q.tip_amount:f.espeed + q.tip_amount:f.extra + q.tip_amount:f.code_rate_id +
## log(q.tlenkm):f.paid_tolls + log(q.tlenkm):f.espeed + log(q.tlenkm):f.extra +
## log(q.tlenkm):f.code_rate_id + log(q.tlenkm):f.payment_type
##
## Df Sum of Sq RSS AIC
## - q.tip_amount:f.extra 2 0.1268 100.37 -17334
## - log(q.tlenkm):f.paid_tolls 1 0.0574 100.30 -17329
## <none> 100.25 -17323
## - q.tip_amount:f.paid_tolls 1 0.2058 100.45 -17322
## - log(q.tlenkm):f.espeed 5 1.7958 102.04 -17284
## - q.tip_amount:f.espeed 5 1.8834 102.13 -17280
## - log(q.tlenkm):f.extra 2 1.6356 101.88 -17266
## - log(q.tlenkm):f.payment_type 2 2.7496 103.00 -17216
## - q.tip_amount:f.code_rate_id 1 3.3059 103.55 -17182
## - log(q.tlenkm):f.code_rate_id 1 25.3144 125.56 -16296
##
## Step: AIC=-17334.4
## log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + q.tip_amount:f.paid_tolls +
## q.tip_amount:f.espeed + q.tip_amount:f.code_rate_id + log(q.tlenkm):f.paid_tolls +
## log(q.tlenkm):f.espeed + log(q.tlenkm):f.extra + log(q.tlenkm):f.code_rate_id +
## log(q.tlenkm):f.payment_type
##
## Df Sum of Sq RSS AIC
## - log(q.tlenkm):f.paid_tolls 1 0.0537 100.43 -17340
## <none> 100.37 -17334
## - q.tip_amount:f.paid_tolls 1 0.2097 100.58 -17333
## - q.tip_amount:f.espeed 5 1.7712 102.14 -17296
## - log(q.tlenkm):f.espeed 5 1.7817 102.15 -17296
## - log(q.tlenkm):f.extra 2 1.8213 102.19 -17268
## - log(q.tlenkm):f.payment_type 2 2.7823 103.16 -17225
## - q.tip_amount:f.code_rate_id 1 3.3274 103.70 -17193
## - log(q.tlenkm):f.code_rate_id 1 25.4051 125.78 -16304
##
## Step: AIC=-17340.37
## log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + q.tip_amount:f.paid_tolls +
## q.tip_amount:f.espeed + q.tip_amount:f.code_rate_id + log(q.tlenkm):f.espeed +
## log(q.tlenkm):f.extra + log(q.tlenkm):f.code_rate_id + log(q.tlenkm):f.payment_type
##
## Df Sum of Sq RSS AIC
## - q.tip_amount:f.paid_tolls 1 0.1745 100.60 -17341
## <none> 100.43 -17340
## - q.tip_amount:f.espeed 5 1.7304 102.16 -17304
## - log(q.tlenkm):f.espeed 5 1.8561 102.28 -17298
## - log(q.tlenkm):f.extra 2 1.8241 102.25 -17274
## - log(q.tlenkm):f.payment_type 2 2.7554 103.18 -17233
## - q.tip_amount:f.code_rate_id 1 3.3149 103.74 -17199
## - log(q.tlenkm):f.code_rate_id 1 25.3540 125.78 -16313
##
## Step: AIC=-17340.81
## log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) + f.paid_tolls +
## f.espeed + f.extra + f.code_rate_id + f.payment_type + q.tip_amount:f.espeed +
## q.tip_amount:f.code_rate_id + log(q.tlenkm):f.espeed + log(q.tlenkm):f.extra +
## log(q.tlenkm):f.code_rate_id + log(q.tlenkm):f.payment_type
##
## Df Sum of Sq RSS AIC
## <none> 100.60 -17341
## - log(q.tlenkm):f.espeed 5 1.8740 102.47 -17298
## - q.tip_amount:f.espeed 5 1.9522 102.55 -17294
## - f.paid_tolls 1 1.3113 101.91 -17290
## - log(q.tlenkm):f.extra 2 1.8579 102.46 -17274
## - log(q.tlenkm):f.payment_type 2 2.7226 103.32 -17235
## - q.tip_amount:f.code_rate_id 1 3.1412 103.74 -17208
## - log(q.tlenkm):f.code_rate_id 1 25.7500 126.35 -16300
Aquest mètode ens diu que:
Anova(model_17)
## Anova Table (Type II tests)
##
## Response: log(target.total_amount)
## Sum Sq Df F value Pr(>F)
## q.tip_amount 22.42 1 1018.820 < 2.2e-16 ***
## log(q.tlenkm) 713.55 1 32428.747 < 2.2e-16 ***
## f.paid_tolls 1.31 1 59.596 1.421e-14 ***
## f.espeed 22.93 5 208.405 < 2.2e-16 ***
## f.extra 5.62 2 127.699 < 2.2e-16 ***
## f.code_rate_id 8.87 1 402.972 < 2.2e-16 ***
## f.payment_type 2.79 2 63.393 < 2.2e-16 ***
## q.tip_amount:f.espeed 1.95 5 17.744 < 2.2e-16 ***
## q.tip_amount:f.code_rate_id 3.14 1 142.756 < 2.2e-16 ***
## log(q.tlenkm):f.espeed 1.87 5 17.034 < 2.2e-16 ***
## log(q.tlenkm):f.extra 1.86 2 42.217 < 2.2e-16 ***
## log(q.tlenkm):f.code_rate_id 25.75 1 1170.261 < 2.2e-16 ***
## log(q.tlenkm):f.payment_type 2.72 2 61.867 < 2.2e-16 ***
## Residuals 100.60 4572
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model_17)
##
## Call:
## lm(formula = log(target.total_amount) ~ q.tip_amount + log(q.tlenkm) +
## f.paid_tolls + f.espeed + f.extra + f.code_rate_id + f.payment_type +
## q.tip_amount:f.espeed + q.tip_amount:f.code_rate_id + log(q.tlenkm):f.espeed +
## log(q.tlenkm):f.extra + log(q.tlenkm):f.code_rate_id + log(q.tlenkm):f.payment_type,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05558 -0.05518 -0.00962 0.05245 2.35141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.926591 0.015134 127.298 < 2e-16
## q.tip_amount 0.062247 0.005199 11.972 < 2e-16
## log(q.tlenkm) 0.619204 0.015692 39.459 < 2e-16
## f.paid_tollsYes 0.191814 0.024847 7.720 1.42e-14
## f.espeed[10,20) -0.200905 0.014350 -14.000 < 2e-16
## f.espeed[20,30) -0.286171 0.015864 -18.039 < 2e-16
## f.espeed[30,40) -0.392635 0.023457 -16.738 < 2e-16
## f.espeed[40,50) -0.668755 0.058657 -11.401 < 2e-16
## f.espeed[50,55] -0.596715 0.073621 -8.105 6.70e-16
## f.extra0.5 0.074778 0.008583 8.713 < 2e-16
## f.extra1 0.169251 0.010904 15.522 < 2e-16
## f.code_rate_idRate-Other 0.807476 0.022679 35.604 < 2e-16
## f.payment_typeCash -0.114061 0.008248 -13.829 < 2e-16
## f.payment_typeNo paid -0.303472 0.047206 -6.429 1.42e-10
## q.tip_amount:f.espeed[10,20) 0.006449 0.005720 1.127 0.2597
## q.tip_amount:f.espeed[20,30) -0.001163 0.005747 -0.202 0.8397
## q.tip_amount:f.espeed[30,40) -0.009265 0.006152 -1.506 0.1322
## q.tip_amount:f.espeed[40,50) -0.027796 0.006883 -4.039 5.47e-05
## q.tip_amount:f.espeed[50,55] -0.039137 0.007461 -5.246 1.63e-07
## q.tip_amount:f.code_rate_idRate-Other 0.089331 0.007477 11.948 < 2e-16
## log(q.tlenkm):f.espeed[10,20) -0.004650 0.015727 -0.296 0.7675
## log(q.tlenkm):f.espeed[20,30) -0.009975 0.016075 -0.621 0.5349
## log(q.tlenkm):f.espeed[30,40) 0.038537 0.017955 2.146 0.0319
## log(q.tlenkm):f.espeed[40,50) 0.155447 0.028369 5.479 4.50e-08
## log(q.tlenkm):f.espeed[50,55] 0.149001 0.032483 4.587 4.62e-06
## log(q.tlenkm):f.extra0.5 -0.045898 0.006164 -7.446 1.14e-13
## log(q.tlenkm):f.extra1 -0.063196 0.008455 -7.475 9.22e-14
## log(q.tlenkm):f.code_rate_idRate-Other -0.483411 0.014131 -34.209 < 2e-16
## log(q.tlenkm):f.payment_typeCash 0.070128 0.006313 11.109 < 2e-16
## log(q.tlenkm):f.payment_typeNo paid 0.061644 0.030379 2.029 0.0425
##
## (Intercept) ***
## q.tip_amount ***
## log(q.tlenkm) ***
## f.paid_tollsYes ***
## f.espeed[10,20) ***
## f.espeed[20,30) ***
## f.espeed[30,40) ***
## f.espeed[40,50) ***
## f.espeed[50,55] ***
## f.extra0.5 ***
## f.extra1 ***
## f.code_rate_idRate-Other ***
## f.payment_typeCash ***
## f.payment_typeNo paid ***
## q.tip_amount:f.espeed[10,20)
## q.tip_amount:f.espeed[20,30)
## q.tip_amount:f.espeed[30,40)
## q.tip_amount:f.espeed[40,50) ***
## q.tip_amount:f.espeed[50,55] ***
## q.tip_amount:f.code_rate_idRate-Other ***
## log(q.tlenkm):f.espeed[10,20)
## log(q.tlenkm):f.espeed[20,30)
## log(q.tlenkm):f.espeed[30,40) *
## log(q.tlenkm):f.espeed[40,50) ***
## log(q.tlenkm):f.espeed[50,55] ***
## log(q.tlenkm):f.extra0.5 ***
## log(q.tlenkm):f.extra1 ***
## log(q.tlenkm):f.code_rate_idRate-Other ***
## log(q.tlenkm):f.payment_typeCash ***
## log(q.tlenkm):f.payment_typeNo paid *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1483 on 4572 degrees of freedom
## Multiple R-squared: 0.9282, Adjusted R-squared: 0.9277
## F-statistic: 2037 on 29 and 4572 DF, p-value: < 2.2e-16
!!!! FALTA DIAGNOSI EXHAUSTIVA !!!
ll1<-Boxplot(rstudent(model_17));ll1
## [1] 4051 80 2621 4269 1385 3802 2666 3676 3291 3634 3021 1978 2032 2005 2711
## [16] 2434 3838 3287 2625 4524
sel2<-which(hatvalues(model_17)>5*length(model_17$coefficients)/nrow(df));sel2;length(sel2)
## 1908 14314 23421 23932 23958 24990 28982 33046 37238 41478
## 7 42 73 76 77 80 97 106 114 128
## 49078 64149 71268 71596 81949 88821 98170 101184 110979 115296
## 157 204 228 231 268 295 317 326 357 373
## 121215 125894 128613 131915 132102 154087 166154 169380 194151 201926
## 389 401 410 418 421 500 536 547 633 658
## 204903 209928 210357 210707 221913 228729 244755 244971 252056 274645
## 674 692 697 699 738 772 830 831 855 914
## 300524 316484 322178 327762 329452 360250 382504 395415 404073 415806
## 980 1024 1038 1053 1057 1150 1204 1247 1278 1319
## 423307 423839 428613 437922 443592 449320 453619 486866 487457 492805
## 1350 1354 1362 1385 1408 1427 1445 1557 1560 1574
## 516357 529475 533937 535041 542034 559358 564751 572644 575739 577950
## 1638 1682 1694 1696 1720 1774 1788 1802 1808 1816
## 590161 620293 621420 621544 625503 632100 645141 654257 657624 658738
## 1850 1954 1958 1960 1968 1990 2028 2065 2075 2080
## 663694 683052 689000 710390 724424 725701 728096 730897 730975 731288
## 2105 2183 2199 2266 2320 2324 2328 2334 2337 2339
## 735280 741591 747830 751896 771658 773934 785532 793294 794902 810930
## 2349 2378 2398 2412 2468 2475 2510 2528 2532 2572
## 825427 826623 829742 861539 881540 892761 894658 896291 920461 957227
## 2621 2625 2631 2723 2788 2837 2847 2853 2927 3035
## 965349 976822 986459 986910 1010111 1010826 1040346 1051194 1060542 1076485
## 3065 3105 3142 3147 3211 3215 3314 3353 3382 3421
## 1082823 1083301 1095371 1109089 1110005 1120203 1120401 1140092 1150441 1159509
## 3449 3453 3497 3535 3538 3566 3567 3633 3667 3696
## 1181893 1227019 1227021 1233051 1242754 1254924 1261276 1287570 1334927 1340781
## 3776 3902 3903 3919 3951 3988 4007 4089 4241 4260
## 1342604 1345546 1347654 1354552 1354822 1356261 1377906 1396114 1407546 1419545
## 4264 4269 4278 4299 4301 4305 4376 4449 4486 4524
## 1421036 1439743
## 4529 4585
## [1] 152
ll2<-which(row.names(model_17) %in% names(hatvalues(model_17)[sel2]));ll2
## integer(0)
sel3<-which(cooks.distance(model_17)> 0.5 );sel3;length(sel3)
## named integer(0)
## [1] 0
ll3<-which(row.names(df) %in% names(cooks.distance(model_17)[sel3]));ll3
## integer(0)
vars_cexp <- vars_cexp[c(1:4,6:10)]; vars_cexp
## [1] "q.passenger_count" "q.trip_distance" "q.fare_amount"
## [4] "q.extra" "q.tolls_amount" "q.hour"
## [7] "q.tlenkm" "q.traveltime" "q.espeed"
table(df$target.tip_is_given, df$f.payment_type)
##
## Credit card Cash No paid
## No 352 2484 29
## Yes 1737 0 0
We can see from the table that it is no credible the fact that any of the people that paid in cash did not leave any tip.
res.cat <- catdes(df, num.var = which(names(df)=="target.tip_is_given"))
res.cat$quanti.var
## Eta2 P-value
## q.tip_amount 0.530313236 0.000000e+00
## target.total_amount 0.062475234 1.704519e-66
## q.dropoff_longitude 0.045623769 1.241947e-48
## q.pickup_longitude 0.035898477 1.874433e-38
## q.fare_amount 0.014755168 1.353812e-16
## q.trip_distance 0.012901088 1.091013e-14
## q.tlenkm 0.012500007 2.820041e-14
## q.dropoff_latitude 0.011813680 1.432540e-13
## q.pickup_latitude 0.010850411 1.403276e-12
## q.traveltime 0.009292813 5.638316e-11
## q.espeed 0.007947848 1.376257e-09
## q.tolls_amount 0.004085851 1.427990e-05
res.cat$test.chi2
## p.value df
## f.payment_type 0.000000e+00 2
## f.cost 1.855099e-93 5
## f.dist 3.632199e-23 3
## f.trip_distance_range 2.119770e-22 2
## f.tt 7.339353e-14 4
## f.espeed 1.128783e-08 5
## f.paid_tolls 2.595115e-06 1
## qual.pickup 5.563582e-05 23
## f.period 6.473080e-05 3
## f.mta_tax 8.160062e-05 1
## f.improvement_surcharge 1.041592e-04 1
## f.trip_type 1.182591e-04 1
## qual.dropoff 3.987953e-04 23
## f.code_rate_id 5.237279e-04 1
## f.hour 4.399605e-02 6
From the quanti.var we can see that tip_is_given depends on tip_amount which seems obvious, due to the fact that they are the same variable treated in different ways.
From the test.chi2 we can see that payment_type has something really clear with the tip_is_given, as we have p-value of 0. Which means that we cannot use payment_type as a predictor.
Target binary factor tip_is_given
ll<-which(df$f.payment_type=="Cash"); length(ll)
## [1] 2484
dff<-df[-ll,]
set.seed(12345)
llwork<-sample(1:nrow(dff),0.70*nrow(dff),replace=FALSE)
llwork<-sort(llwork);length(llwork)
## [1] 1482
dffwork<-dff[llwork,]
dfftest<-dff[-llwork,]
————–MIN 28:00
Explicative Variables for modeling purposes are those available in dataset, exceptions will be indicated, if any.
Multivariant Analysis conducted in previous deliverables has to be used to select the initial model. Students have some degrees in freedom in model building, but the following conditions are requested:
When referring to the performance of a classification model, we are interested in the model’s ability to correctly predict or separate the classes. When looking at the errors made by a classification model, the confusion matrix gives the full picture. Consider e.g. a three class problem with the classes A, and B. The confusion matrix shows how the predictions are made by the model. The rows correspond to the known class of the data, i.e. the labels in the data. The columns correspond to the predictions made by the model. The value of each of element in the matrix is the number of predictions made with the class corresponding to the column for examples with the correct value as represented by the row. Thus, the diagonal elements show the number of correct classifications made for each class, and the off-diagonal elements show the errors made.